require(boot) # package boot provides cv.glm(), which cross-validates a glm object

#### ------------------ simulating different errors --------------------------------
nX=20                      # number of observations
X=array(1,dim=c(nX,6))     # intercept + 5 variables
X[,-1]=rnorm(nX*5,0,1)     # predictors
beta.v=c(.5,1,-1,2,-2,0.1) # regression coefficients
sigma0=3                   # standard deviation (not variance) of the noise in the response Y

# first, estimating the prediction and training errors on a single training set
Y=rnorm(nX,mean=X%*%beta.v,sd=sigma0) # training set
X.dat=as.data.frame(X); X.dat=cbind(X.dat,Y)
Y.lm=lm(Y~V2+V3+V4+V5+V6,data=X.dat) # V1 (the all-ones column) excluded: lm() adds its own intercept
n.rep=10000
Y.eval=rnorm(nX*n.rep,mean=rep(X%*%beta.v,times=n.rep),sd=sigma0) # evaluation data
sum((Y.eval-rep(predict(Y.lm),times=n.rep))^2)/(nX*n.rep) # prediction error
sum((Y-predict(Y.lm))^2)/(nX) # training error

Y.glm=glm(Y~V2+V3+V4+V5+V6,data=X.dat) # same model refitted as a glm, so that cv.glm() can be used

# using cross-validation
cv.err <- cv.glm(X.dat,Y.glm) # leave-one-out cross-validation
cv.err.5 <- cv.glm(X.dat,Y.glm,K=5) # 5-fold cross-validation
cv.err$delta # output includes the raw estimate and a bias-corrected one
cv.err.5$delta # one estimate, based on a single partition into 5 folds

# cross-validation done manually, to illustrate the process; not as efficient as e.g. cv.glm
# first, leave-one-out
LOO.tr=numeric(nX)
for (i in 1:nX){
  Y.lm.LOO=lm(Y~V2+V3+V4+V5+V6,data=X.dat[-i,]) # fit without observation i
  LOO.tr[i]=(Y[i]-predict(Y.lm.LOO,newdata=X.dat[i,]))^2
}
summary(LOO.tr)

# second, 5-fold cross-validation
CV5.tr=numeric(5)
ind.CV=split(sample(1:nX,replace=F),f=rep(1:5,each=(nX/5))) # list of indices defining the partition into 5 folds
for (i in 1:5){
  Y.lm.cv5=lm(Y~V2+V3+V4+V5+V6,data=X.dat[-ind.CV[[i]],]) # fit without fold i
  CV5.tr[i]=sum((Y[ind.CV[[i]]]-predict(Y.lm.cv5,newdata=X.dat[ind.CV[[i]],]))^2)
}
summary(CV5.tr)

# Estimating the expected error (over different training sets)
n.sim=200 # number of training sets
train.err=pred.err=numeric(n.sim)
err.LOO=err.CV5=array(0,dim=c(n.sim,2))
for (i in 1:n.sim) {
  Y=rnorm(nX,mean=X%*%beta.v,sd=sigma0) # training set
  X.dat=as.data.frame(X); X.dat=cbind(X.dat,Y)
  Y.lm=lm(Y~V2+V3+V4+V5+V6,data=X.dat)
  Y.eval=rnorm(nX*n.rep,mean=rep(X%*%beta.v,times=n.rep),sd=sigma0) # evaluation data
  pred.err[i]=sum((Y.eval-rep(predict(Y.lm),times=n.rep))^2)/(nX*n.rep) # prediction error
  train.err[i]=sum((Y-predict(Y.lm))^2)/(nX)
  Y.glm=glm(Y~V2+V3+V4+V5+V6,data=X.dat)
  cv.err <- cv.glm(X.dat,Y.glm) # leave-one-out cross-validation
  cv.err.5 <- cv.glm(X.dat,Y.glm,K=5) # 5-fold cross-validation
  err.LOO[i,]=cv.err$delta
  err.CV5[i,]=cv.err.5$delta
}
summary(pred.err)
summary(train.err)
summary(err.LOO) # columns: raw and bias-corrected LOO estimates
summary(err.CV5)
sigma0^2+(6/nX)*sigma0^2 # theoretical value of the expected prediction error (e.g. Hastie et al. 2009, p. 224)
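
# --- Added check (a small sketch, not in the original script): compare the simulated
# averages to theory. For an unbiased linear model with p=6 parameters,
# E[prediction error] = sigma^2 + (p/nX)*sigma^2 and E[training error] = sigma^2 - (p/nX)*sigma^2;
# the difference, 2*(p/nX)*sigma^2, is the optimism of the training error (Hastie et al. 2009, Ch. 7)
mean(pred.err) # should be close to sigma0^2 + (6/nX)*sigma0^2 above
mean(train.err) # should be close to the theoretical value below
sigma0^2-(6/nX)*sigma0^2 # theoretical expected training error
mean(pred.err)-mean(train.err) # simulated optimism, roughly 2*(6/nX)*sigma0^2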

# same calculations, but using models simpler than the true model generating the data
# a model simpler than the truth can predict better, because its sampling variance is small compared to the true model's
# we consider two alternatives: ignoring the important covariates V4 and V5, which is too simple and leads to large biases,
# and ignoring only the unimportant V6, where a small bias is compensated by lower variability (and therefore lower MSE)
Y=rnorm(nX,mean=X%*%beta.v,sd=sigma0) # training set
X.dat=as.data.frame(X); X.dat=cbind(X.dat,Y)
Y.lm.s1=lm(Y~V2+V3+V4+V5,data=X.dat) # first "simple" model: V6 removed
Y.lm.s2=lm(Y~V2+V3,data=X.dat) # second "simple" model: V4, V5 and V6 removed
n.rep=10000
Y.eval=rnorm(nX*n.rep,mean=rep(X%*%beta.v,times=n.rep),sd=sigma0) # evaluation data to calculate the error
sum((Y.eval-rep(predict(Y.lm.s1),times=n.rep))^2)/(nX*n.rep) # prediction error for model s1
sum((Y.eval-rep(predict(Y.lm.s2),times=n.rep))^2)/(nX*n.rep) # prediction error for model s2
sum((Y-predict(Y.lm.s1))^2)/(nX) # training error of s1, calculated on the sample (optimistic, i.e. too low)
sum((Y-predict(Y.lm.s2))^2)/(nX) # training error of s2

# Calculating predicted values over n.sim training sets for the three models (full = true model, plus the two simple models s1 and s2)
# bias is obtained by comparing average predictions to the true means,
# and variance is the variance of the predictions; the overall measure is MSE = bias^2 + variance
n.sim=100 # number of training sets
pred.tr=pred.tr1=pred.tr2=array(0,dim=c(n.sim,nX))
for (i in 1:n.sim) {
  Y=rnorm(nX,mean=X%*%beta.v,sd=sigma0) # training set
  X.dat=as.data.frame(X); X.dat=cbind(X.dat,Y)
  Y.lm=lm(Y~V2+V3+V4+V5+V6,data=X.dat)
  Y.lm.s1=lm(Y~V2+V3+V4+V5,data=X.dat)
  Y.lm.s2=lm(Y~V2+V3,data=X.dat)
  pred.tr[i,]=predict(Y.lm)
  pred.tr1[i,]=predict(Y.lm.s1)
  pred.tr2[i,]=predict(Y.lm.s2)
}
apply(pred.tr,MAR=2,mean) # average predictions
as.vector(X%*%beta.v) # true means of the predictions
apply(pred.tr,MAR=2,mean) - as.vector(X%*%beta.v) # bias for the true model, theoretically 0
apply(pred.tr1,MAR=2,mean) - as.vector(X%*%beta.v) # bias for the simple model removing the unimportant variable
apply(pred.tr2,MAR=2,mean) - as.vector(X%*%beta.v) # bias for the simple model removing two important variables and the unimportant one
apply(pred.tr,MAR=2,var) # variance for the true model, theoretically given by x0 and the H matrix, see e.g. Hastie et al. 2009
apply(pred.tr1,MAR=2,var) # variance for the simple model removing the unimportant variable
apply(pred.tr2,MAR=2,var) # variance for the simple model removing two important variables and the unimportant one
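
# --- Added check (a small sketch, not in the original script): verify numerically that
# MSE = bias^2 + variance for each predicted value of, say, model s2; var() uses the
# n-1 denominator, so it is rescaled by (n.sim-1)/n.sim to make the identity exact.
# The helper names below (mu.true, bias2.s2, ...) are illustrative
mu.true=as.vector(X%*%beta.v)
bias2.s2=(apply(pred.tr2,MAR=2,mean) - mu.true)^2
var.s2=apply(pred.tr2,MAR=2,var)*(n.sim-1)/n.sim
mse.s2=apply(sweep(pred.tr2,MAR=2,FUN="-",STATS=mu.true),MAR=2,FUN=function(x) sum(x^2)/length(x))
summary(mse.s2 - (bias2.s2 + var.s2)) # should be zero up to floating-point rounding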

par(mfrow=c(3,1)) # plotting the squared bias, variance and MSE for the 20 predicted values (i.e. the 20 sample units)
plot(1:20,(apply(pred.tr2,MAR=2,mean) - as.vector(X%*%beta.v))^2,pch=19) # s2 (black): strongly biased but precise (low variance)
points(1:20,(apply(pred.tr1,MAR=2,mean) - as.vector(X%*%beta.v))^2,col="blue",pch=19) # s1 (blue): almost unbiased
points(1:20+.1,(apply(pred.tr,MAR=2,mean) - as.vector(X%*%beta.v))^2,col="green",pch=19) # full model (green): unbiased but relatively imprecise
abline(0,0)
title(main="BIAS SQUARED")
plot(1:20,apply(pred.tr1,MAR=2,var),col="blue",pch=19,ylim=c(0,10)) # s1 (blue)
points(1:20+.1,apply(pred.tr2,MAR=2,var),pch=19) # s2 (black)
points(1:20+.2,apply(pred.tr,MAR=2,var),col="green",pch=19) # full model (green)
title(main="VARIANCE")
sumsq=function(x) sum(x^2)/length(x) # mean of squares, used to compute the MSE directly
plot(1:20,apply(sweep(pred.tr2,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq),pch=19)
points(1:20,apply(sweep(pred.tr1,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq),pch=19,col="blue")
points(1:20,apply(sweep(pred.tr,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq),pch=19,col="green")
title(main="MSE")
sum(apply(sweep(pred.tr,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq)) # total MSE, full model
sum(apply(sweep(pred.tr1,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq)) # total MSE, model s1
sum(apply(sweep(pred.tr2,MAR=2,FUN="-",STATS=as.vector(X%*%beta.v)),MAR=2,FUN=sumsq)) # total MSE, model s2
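
# --- Added illustration (a sketch, not in the original script): in practice the true
# means are unknown, but the cross-validation of the first part estimates the prediction
# error from the data alone, so it can be used to choose among the three models.
# The ranking is noisy on a single training set; the object names below are illustrative
Y=rnorm(nX,mean=X%*%beta.v,sd=sigma0) # a fresh training set
X.dat=as.data.frame(X); X.dat=cbind(X.dat,Y)
Y.glm.full=glm(Y~V2+V3+V4+V5+V6,data=X.dat)
Y.glm.s1=glm(Y~V2+V3+V4+V5,data=X.dat)
Y.glm.s2=glm(Y~V2+V3,data=X.dat)
cv.full=cv.glm(X.dat,Y.glm.full)$delta[1] # LOO CV error, full model
cv.s1=cv.glm(X.dat,Y.glm.s1)$delta[1] # LOO CV error, model s1
cv.s2=cv.glm(X.dat,Y.glm.s2)$delta[1] # LOO CV error, model s2
c(full=cv.full,s1=cv.s1,s2=cv.s2) # the smallest value indicates the model expected to predict best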